{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8204ce81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql import Row\n",
    "from pyspark import SparkContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3e259151",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build (or reuse) a local Spark session named \"My app\" and take its SparkContext.\n",
    "# Constructing SparkContext(...) directly raises\n",
    "# \"ValueError: Cannot run multiple SparkContexts at once\" when this cell is re-run\n",
    "# in a live kernel; going through getOrCreate() keeps the cell re-runnable.\n",
    "sc = SparkSession.builder.master(\"local\").appName(\"My app\").getOrCreate().sparkContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd16b070",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Teardown is intentionally disabled at this position. Calling sc.stop() here, before\n",
    "# the cells below run, stops the SparkContext that sc.textFile(...) and every later\n",
    "# step still depend on, so Restart & Run All would fail.\n",
    "# When the analysis is finished, stop Spark manually with: sc.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8c94280e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Obtain a SparkSession through the builder; it is used later for createDataFrame().\n",
    "# NOTE(review): the variable is called sqlContext, but the object is a SparkSession.\n",
    "sqlContext = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "083b3fa1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the book listing from the local filesystem as an RDD of text lines.\n",
    "# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.\n",
    "book_rdd = sc.textFile(\"file:///home/demos/实验数据/book.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7a80b211",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "54351"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of lines read from the file; the header line is still included at this point.\n",
    "book_rdd.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ddc28563",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Break every text line into a list of field strings, using a plain comma as the separator.\n",
    "temp_rdd = book_rdd.map(lambda line: line.split(\",\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5fe1baed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['序号', '书名', '评分', '价格', '出版社', 'url'],\n",
       " ['5173',\n",
       "  '動力取向精神醫學--臨床應用與實務',\n",
       "  '10.0 ',\n",
       "  '1200元',\n",
       "  '心灵工坊',\n",
       "  'https://book.douban.com/subject/6053667/'],\n",
       " ['9929',\n",
       "  '水彩绘森活',\n",
       "  '10.0 ',\n",
       "  '29.8',\n",
       "  '人民邮电出版社',\n",
       "  'https://book.douban.com/subject/26115807/'],\n",
       " ['10124',\n",
       "  '殷周金文集成(修订增补本共8册)(精)',\n",
       "  '10.0 ',\n",
       "  '2400.00元',\n",
       "  '中华书局',\n",
       "  'https://book.douban.com/subject/2235855/'],\n",
       " ['16628',\n",
       "  '纸雕游戏大书',\n",
       "  '10.0 ',\n",
       "  '99.00元',\n",
       "  '重庆出版集团',\n",
       "  'https://book.douban.com/subject/26673804/']]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five split records to check the field layout (the header comes first).\n",
    "temp_rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "25054707",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the six positional fields of each split record by wrapping it in a Row.\n",
    "# Note: this rebinds book_rdd from the raw text-line RDD to an RDD of Row objects.\n",
    "book_rdd = temp_rdd.map(\n",
    "    lambda fields: Row(\n",
    "        id=fields[0],\n",
    "        name=fields[1],\n",
    "        rating=fields[2],\n",
    "        price=fields[3],\n",
    "        publish=fields[4],\n",
    "        url=fields[5],\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f8970911",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id='序号', name='书名', rating='评分', price='价格', publish='出版社', url='url'),\n",
       " Row(id='5173', name='動力取向精神醫學--臨床應用與實務', rating='10.0 ', price='1200元', publish='心灵工坊', url='https://book.douban.com/subject/6053667/'),\n",
       " Row(id='9929', name='水彩绘森活', rating='10.0 ', price='29.8', publish='人民邮电出版社', url='https://book.douban.com/subject/26115807/'),\n",
       " Row(id='10124', name='殷周金文集成(修订增补本共8册)(精)', rating='10.0 ', price='2400.00元', publish='中华书局', url='https://book.douban.com/subject/2235855/'),\n",
       " Row(id='16628', name='纸雕游戏大书', rating='10.0 ', price='99.00元', publish='重庆出版集团', url='https://book.douban.com/subject/26673804/')]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five Row records; the header record is still the first one here.\n",
    "book_rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6fe6966f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capture the first record (the row of column titles) so it can be filtered out next.\n",
    "header = book_rdd.first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0648b4fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the records that differ from the captured header record.\n",
    "book_rdd = book_rdd.filter(lambda record: record != header)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7a08c695",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(id='序号', name='书名', rating='评分', price='价格', publish='出版社', url='url')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the captured header record.\n",
    "header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e3dde07b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id='5173', name='動力取向精神醫學--臨床應用與實務', rating='10.0 ', price='1200元', publish='心灵工坊', url='https://book.douban.com/subject/6053667/'),\n",
       " Row(id='9929', name='水彩绘森活', rating='10.0 ', price='29.8', publish='人民邮电出版社', url='https://book.douban.com/subject/26115807/'),\n",
       " Row(id='10124', name='殷周金文集成(修订增补本共8册)(精)', rating='10.0 ', price='2400.00元', publish='中华书局', url='https://book.douban.com/subject/2235855/'),\n",
       " Row(id='16628', name='纸雕游戏大书', rating='10.0 ', price='99.00元', publish='重庆出版集团', url='https://book.douban.com/subject/26673804/'),\n",
       " Row(id='19103', name='Michelangelo', rating='10.0 ', price='$200.00 ', publish='Taschen', url='https://book.douban.com/subject/2342660/')]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview five records again to confirm the header record is no longer present.\n",
    "book_rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "eb765558",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the Row RDD into a DataFrame. The column list must name all six Row fields,\n",
    "# in order. Leaving out \"name\" shifts every later label by one position (book titles\n",
    "# end up under \"rating\", ratings under \"price\", ...) and yields two \"url\" columns.\n",
    "book_df = sqlContext.createDataFrame(\n",
    "    book_rdd,\n",
    "    [\"id\", \"name\", \"rating\", \"price\", \"publish\", \"url\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "688f3c9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id='5173', rating='動力取向精神醫學--臨床應用與實務', price='10.0 ', publish='1200元', url='心灵工坊', url='https://book.douban.com/subject/6053667/'),\n",
       " Row(id='9929', rating='水彩绘森活', price='10.0 ', publish='29.8', url='人民邮电出版社', url='https://book.douban.com/subject/26115807/'),\n",
       " Row(id='10124', rating='殷周金文集成(修订增补本共8册)(精)', price='10.0 ', publish='2400.00元', url='中华书局', url='https://book.douban.com/subject/2235855/'),\n",
       " Row(id='16628', rating='纸雕游戏大书', price='10.0 ', publish='99.00元', url='重庆出版集团', url='https://book.douban.com/subject/26673804/'),\n",
       " Row(id='19103', rating='Michelangelo', price='10.0 ', publish='$200.00 ', url='Taschen', url='https://book.douban.com/subject/2342660/')]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five DataFrame rows as Row objects to check the column labels.\n",
    "book_df.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "faebec65",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: string (nullable = true)\n",
      " |-- rating: string (nullable = true)\n",
      " |-- price: string (nullable = true)\n",
      " |-- publish: string (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the DataFrame's column names and types.\n",
    "book_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0ae57709",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+-----------------------------------+-----+---------+------------------+--------------------+\n",
      "|   id|                             rating|price|  publish|               url|                 url|\n",
      "+-----+-----------------------------------+-----+---------+------------------+--------------------+\n",
      "| 5173|   動力取向精神醫學--臨床應用與實務|10.0 |   1200元|          心灵工坊|https://book.doub...|\n",
      "| 9929|                         水彩绘森活|10.0 |     29.8|    人民邮电出版社|https://book.doub...|\n",
      "|10124|  殷周金文集成(修订增补本共8册)(精)|10.0 |2400.00元|          中华书局|https://book.doub...|\n",
      "|16628|                       纸雕游戏大书|10.0 |  99.00元|      重庆出版集团|https://book.doub...|\n",
      "|19103|                       Michelangelo|10.0 | $200.00 |           Taschen|https://book.doub...|\n",
      "|20063|                  一支笔的快乐涂鸦2|10.0 |     29.8|    人民邮电出版社|https://book.doub...|\n",
      "|32781|                         亲亲宝贝装|10.0 |  28.00元|江西科学技术出版社|https://book.doub...|\n",
      "|32879|                     Photoshop7解像|10.0 |  68.00元|        海洋出版社|https://book.doub...|\n",
      "|45687|                   戚蓼生序本石头记|10.0 | 350.00元|    人民文学出版社|https://book.doub...|\n",
      "|52504|                      宇宙兄弟（7）|10.0 |   JPY580|            講談社|https://book.doub...|\n",
      "|52505|                      宇宙兄弟（8）|10.0 |   JPY580|            講談社|https://book.doub...|\n",
      "|  573|            TCP\\IP详解（卷1英文版）| 9.9 |       45|    机械工业出版社|https://book.doub...|\n",
      "|  589|计算机程序设计艺术卷1：基本算法(...| 9.9 | 119.00元|    人民邮电出版社|https://book.doub...|\n",
      "| 5522|         微积分和数学分析引论-第1卷| 9.9 |  79.00元|  世界图书出版公司|https://book.doub...|\n",
      "| 5547|               PrinciplesofNeura...| 9.9 | $103.41 |McGraw-HillMedical|https://book.doub...|\n",
      "| 7443|           奈特人体神经解剖彩色图谱| 9.9 | 138.00元|    人民卫生出版社|https://book.doub...|\n",
      "| 8703|                 数学、科学和认识论| 9.9 |  32.00元|        商务印书馆|https://book.doub...|\n",
      "| 9924|                       零基础学素描| 9.9 |     20元|    人民邮电出版社|https://book.doub...|\n",
      "| 9926|     黑白花意3：300例超写实的花之绘| 9.9 |  29.80元|    人民邮电出版社|https://book.doub...|\n",
      "| 9927|         黑白画意：经典植物手绘教程| 9.9 |  29.80元|    人民邮电出版社|https://book.doub...|\n",
      "+-----+-----------------------------------+-----+---------+------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Render the DataFrame as a text table; the recorded output shows only the top 20 rows.\n",
    "book_df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9e98b68",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
